Assembling assays into a master MultiAssayExperiment object

Author

Laura Symul, Laura Vermeren

Published

September 30, 2025

Code
library(tidyverse)
library(magrittr)
library(gt)
library(patchwork)
library(SummarizedExperiment)
library(tidySummarizedExperiment)
library(MultiAssayExperiment)

tmp <- fs::dir_map("R scripts/", source)

theme_set(theme_light())

Clinical data (CRF data)

CRF data (raw and QCed)

Code
crf_files <- 
  get_01_output_dir() |> 
  fs::dir_ls() |> 
  str_subset("/01_")

crf_clean_file <- crf_files |> str_subset("01_crf_clean") |> sort(decreasing = TRUE) |> magrittr::extract(1)
crf_raw_file <- crf_files |> str_subset("01_crf_raw") |> sort(decreasing = TRUE) |> magrittr::extract(1)
crf_dict_file <- crf_files |> str_subset("01_crf_plates_dictionary") |> sort(decreasing = TRUE) |> magrittr::extract(1)

load(crf_clean_file, verbose = TRUE)
Loading objects:
  crf_clean
Code
load(crf_raw_file, verbose = TRUE)
Loading objects:
  crf_raw
Code
load(crf_dict_file, verbose = TRUE)
Loading objects:
  crf_plates_dictionary
Code
rm(crf_clean_file, crf_raw_file, crf_dict_file)

Participants table

Code
load(
  crf_files |> str_subset("01_participants_2025") |> sort(decreasing = TRUE) |> magrittr::extract(1),
  verbose = TRUE
  )
Loading objects:
  participants
Code
load(
  crf_files |> str_subset("01_participants_variable_dictionary") |> sort(decreasing = TRUE) |> magrittr::extract(1),
  verbose = TRUE
  )
Loading objects:
  participants_variable_dictionary
Code
load(
  crf_files |> str_subset("01_participant_crfs_merged") |> sort(decreasing = TRUE) |> magrittr::extract(1),
  verbose = TRUE
  )
Loading objects:
  participant_crfs_merged

Visits table

Code
load(
  crf_files |> str_subset("01_visits_2025") |> sort(decreasing = TRUE) |> magrittr::extract(1),
  verbose = TRUE
  )
Loading objects:
  visits
Code
load(
  crf_files |> str_subset("01_visits_long_2025") |> sort(decreasing = TRUE) |> magrittr::extract(1),
  verbose = TRUE
  )
Loading objects:
  visits_long
Code
load(
  crf_files |> str_subset("01_visits_crfs_merged") |> sort(decreasing = TRUE) |> magrittr::extract(1),
  verbose = TRUE
  )
Loading objects:
  visits_crfs_merged
Code
visits |> 
  ggplot() +
  aes(x = study_day, y = pid |> factor(), color = visit_code |> factor()) +
  geom_point() 

We define the uid (unique identifier):

Code
visits <- 
  visits |> 
  mutate(
    uid = str_c(pid, visit_code, sep = "_")
  ) |> 
  select(uid, everything())

Exposures table

Code
load(
  crf_files |> str_subset("01_exposures_") |> sort(decreasing = TRUE) |> magrittr::extract(1),
  verbose = TRUE
  )
Loading objects:
  exposures

Assays: loading the SummarizedExperiment objects

Code
data_dir <- get_01_output_dir()
se_files <- fs::dir_ls(data_dir, regexp = "se_.*\\.rds$") |> sort(decreasing = TRUE)

Metagenomics data

Code
mg_file <- se_files[grepl("mg", se_files)][1]
se_mg <- readRDS(mg_file)
se_mg <- check_se(se_mg)
se_mg
# A SummarizedExperiment-tibble abstraction: 775,884 × 58
# Features=779 | Samples=996 | Assays=count, count_corr, rel_ab_all,
#   rel_ab_bact, rel_ab
   .feature .sample  count count_corr rel_ab_all rel_ab_bact rel_ab uid   mg_uid
   <chr>    <chr>    <dbl>      <dbl>      <dbl>       <dbl>  <dbl> <chr> <chr> 
 1 C0022A1  0681000…     0          0          0           0      0 0681… MG_202
 2 C0059E1  0681000…     0          0          0           0      0 0681… MG_202
 3 C0175A1  0681000…     0          0          0           0      0 0681… MG_202
 4 FF00018  0681000…     0          0          0           0      0 0681… MG_202
 5 FF00051  0681000…     0          0          0           0      0 0681… MG_202
 6 UC101    0681000…     0          0          0           0      0 0681… MG_202
 7 122010   0681000…     0          0          0           0      0 0681… MG_202
 8 185329   0681000…     0          0          0           0      0 0681… MG_202
 9 C0006A1  0681000…     0          0          0           0      0 0681… MG_202
10 C0028A1  0681000…     0          0          0           0      0 0681… MG_202
# ℹ 40 more rows
# ℹ 49 more variables: sample_type <fct>, control_type <fct>,
#   total_non_human_reads <dbl>, rel_ab_multi_genera <dbl>,
#   poor_quality_mg_data <lgl>, cst <chr>, sub_cst <chr>, valencia_score <dbl>,
#   swab_barcode <chr>, mg_pid <chr>, mg_visit_code <chr>,
#   visit_code_flagged <lgl>, mg_sample_type <chr>, ext_lib_plate_nb <dbl>,
#   ext_lib_plate_id <dbl>, ext_lib_position <chr>, …

qPCR data

Code
qPCR_file <- se_files[grepl("03_se_pcr_agg", se_files)][1]
se_qPRC <- readRDS(qPCR_file)
se_qPRC <- check_se(se_qPRC)
se_qPRC
# A SummarizedExperiment-tibble abstraction: 16,464 × 23
# Features=16 | Samples=1029 | Assays=n_non_na, dilution, copies_per_swab_med,
#   copies_per_swab_mean, copies_per_swab_cv
   .feature .sample   n_non_na dilution copies_per_swab_med copies_per_swab_mean
   <chr>    <chr>        <int>    <dbl>               <dbl>                <dbl>
 1 C0175A1  06810000…        3        5                   0                    0
 2 C0022A1  06810000…        3        5                   0                    0
 3 C0059E1  06810000…        3        5                   0                    0
 4 UC101    06810000…        3        5                   0                    0
 5 FF00018  06810000…        3        5                   0                    0
 6 FF00051  06810000…        3        5                   0                    0
 7 185329   06810000…        3       20                   0                    0
 8 C0028A1  06810000…        3       20                   0                    0
 9 C0112A1  06810000…        3       20                   0                    0
10 FF00004  06810000…        3       20                   0                    0
# ℹ 310 more rows
# ℹ 17 more variables: copies_per_swab_cv <dbl>, uid <chr>,
#   qpcr_sample_type <fct>, sample_type <chr>, control_type <chr>,
#   qpcr_sample_id <chr>, vibr_sample_id <chr>, ext_lib_plate_nb <chr>,
#   target <fct>, fluor <chr>, strain_group_qpcr <chr>, taxon_label <chr>,
#   LBP <fct>, strain_id <fct>, strain_origin <fct>, biose_id <chr>,
#   pcr_plate_ids <chr>

qPCR data for daily swabs (generated post-unblinding)

Code
daily_qPCR_file <- se_files[grepl("30_se_pcr_agg", se_files)][1]
se_daily_qPRC <- readRDS(daily_qPCR_file)
se_daily_qPRC <- check_se(se_daily_qPRC)
se_daily_qPRC
# A SummarizedExperiment-tibble abstraction: 47,840 × 24
# Features=16 | Samples=2990 | Assays=n_non_na, dilution, copies_per_swab_med,
#   copies_per_swab_mean, copies_per_swab_cv
   .feature .sample   n_non_na dilution copies_per_swab_med copies_per_swab_mean
   <chr>    <chr>        <int>    <dbl>               <dbl>                <dbl>
 1 C0175A1  06810000…        3       10                  0                    0 
 2 C0022A1  06810000…        3       10                  0                    0 
 3 C0059E1  06810000…        3       10                  0                    0 
 4 UC101    06810000…        3       10                568.                1170.
 5 FF00018  06810000…        3       10               2436.                2069.
 6 FF00051  06810000…        3       10               1625.                1730.
 7 185329   06810000…        3       10                  0                    0 
 8 C0028A1  06810000…        3       10                  0                    0 
 9 C0112A1  06810000…        3       10                  0                    0 
10 FF00004  06810000…        3       10                  0                    0 
# ℹ 310 more rows
# ℹ 18 more variables: copies_per_swab_cv <dbl>, uid <chr>,
#   qpcr_sample_type <chr>, sample_type <chr>, control_type <chr>,
#   qpcr_sample_id <chr>, vibr_sample_id <chr>, pcr_plate_id <chr>,
#   pcr_plate_nb <chr>, ext_lib_plate_nb <chr>, target <fct>, fluor <chr>,
#   strain_group_qpcr <chr>, taxon_label <chr>, LBP <fct>, strain_id <fct>,
#   strain_origin <fct>, biose_id <chr>

16S rRNA amplicon sequencing

Code
amplicon_file <- se_files[grepl("16S_agg", se_files)][1]
se_16S <- readRDS(amplicon_file)
se_16S <- check_se(se_16S)
se_16S
# A SummarizedExperiment-tibble abstraction: 3,365,223 × 48
# Features=817 | Samples=4119 | Assays=counts, rel_ab
   .feature  .sample counts rel_ab uid   exclude_sample total_counts sample_type
   <chr>     <chr>    <dbl>  <dbl> <chr> <chr>                 <dbl> <chr>      
 1 Lactobac… 068100…   6904 0.0933 0681… No                    74008 Clinical s…
 2 Lactobac… 068100…      0 0      0681… No                    74008 Clinical s…
 3 Gardnere… 068100…   8284 0.112  0681… No                    74008 Clinical s…
 4 Ca_Lachn… 068100…      0 0      0681… No                    74008 Clinical s…
 5 Sneathia… 068100…  21143 0.286  0681… No                    74008 Clinical s…
 6 Fannyhes… 068100…   1601 0.0216 0681… No                    74008 Clinical s…
 7 Sneathia… 068100…      0 0      0681… No                    74008 Clinical s…
 8 Prevotel… 068100…   1963 0.0265 0681… No                    74008 Clinical s…
 9 Prevotel… 068100…      0 0      0681… No                    74008 Clinical s…
10 f_Dethio… 068100…   3629 0.0490 0681… No                    74008 Clinical s…
# ℹ 40 more rows
# ℹ 40 more variables: control_type <chr>, sample_id <chr>,
#   sample_id_barcode <chr>, patient_id <chr>, visit_code_ps <chr>,
#   idt_adapter_name <chr>, i5_index_forward <chr>, i5_index_reverse <chr>,
#   i7_index_reverse <chr>, i7_index <chr>, sample_id_2 <chr>,
#   total_spike_in_rel <dbl>, total_spike_in_counts <dbl>, cst <chr>,
#   sub_cst <chr>, score <dbl>, specimen_type <chr>, well_id <chr>, …

Luminex data

Code
luminex_file <- se_files[grepl("luminex_agg", se_files)][1]
se_luminex <- readRDS(luminex_file)
se_luminex <- check_se(se_luminex)
se_luminex
# A SummarizedExperiment-tibble abstraction: 40,800 × 37
# Features=48 | Samples=850 | Assays=obs_conc, obs_conc_imp, value_type,
#   value_types, sd_log10_obs_conc_imp, obs_conc_log10, obs_conc_imp_log10
   .feature  .sample        obs_conc obs_conc_imp value_type         value_types
   <chr>     <chr>             <dbl>        <dbl> <chr>              <chr>      
 1 CTACK     068100004_0000    207.         207.  Observed concentr… Observed c…
 2 Eotaxin   068100004_0000     90.7         90.7 Observed concentr… Observed c…
 3 FGF basic 068100004_0000    526.         526.  Observed concentr… Observed c…
 4 G-CSF     068100004_0000    767.         767.  Observed concentr… Observed c…
 5 GM-CSF    068100004_0000     64.2         64.2 Observed concentr… Observed c…
 6 GRO-a     068100004_0000   3001.        3001.  Observed concentr… Observed c…
 7 HGF       068100004_0000   3805.        3805.  Observed concentr… Observed c…
 8 IFN-a2    068100004_0000    328.         328.  Observed concentr… Observed c…
 9 IFN-g     068100004_0000   4074.        4074.  Observed concentr… Observed c…
10 IL-10     068100004_0000     33.2         33.2 Observed concentr… Observed c…
# ℹ 38 more rows
# ℹ 31 more variables: sd_log10_obs_conc_imp <dbl>, obs_conc_log10 <dbl>,
#   obs_conc_imp_log10 <dbl>, uid <chr>, sample_id <chr>, visit <dbl>,
#   weight <dbl>, volume <dbl>, approx_volume_based_on_aliquot_repos <dbl>,
#   aliquot_volume <dbl>, dilution_200ul_tube <dbl>,
#   volume_added_for_assay <dbl>, dilution_manifest <dbl>, dilution_new <dbl>,
#   shorthand_title <chr>, alternate_title <chr>, …

Flow cytometry data

Code
flow_file <- se_files[grepl("flow", se_files)][1]
se_flow <- readRDS(flow_file)
se_flow <- check_se(se_flow)
se_flow
# A SummarizedExperiment-tibble abstraction: 3,916 × 15
# Features=11 | Samples=356 | Assays=count, percentage
   .feature     .sample        count percentage uid   sample machine sample_type
   <chr>        <chr>          <dbl>      <dbl> <chr> <chr>  <chr>   <chr>      
 1 Live         068100004_10… 2.16e6      NA    0681… Speci… 5R      Clinical s…
 2 Granulocytes 068100004_10… 2.98e4       1.38 0681… Speci… 5R      Clinical s…
 3 B_cells      068100004_10… 3.68e2      NA    0681… Speci… 5R      Clinical s…
 4 APC          068100004_10… 7.88e2      NA    0681… Speci… 5R      Clinical s…
 5 CD40         068100004_10… 4.06e2      51.5  0681… Speci… 5R      Clinical s…
 6 CD80         068100004_10… 3.82e2      48.5  0681… Speci… 5R      Clinical s…
 7 CD86         068100004_10… 4.92e2      62.4  0681… Speci… 5R      Clinical s…
 8 CD4          068100004_10… 4.13e3      NA    0681… Speci… 5R      Clinical s…
 9 CD8          068100004_10… 1.48e3      NA    0681… Speci… 5R      Clinical s…
10 DblePosCD4   068100004_10… 1.58e2       3.82 0681… Speci… 5R      Clinical s…
# ℹ 210 more rows
# ℹ 7 more variables: control_type <chr>, pid_assay <chr>,
#   visit_code_assay <chr>, site_assay <chr>, cell_type <chr>,
#   parent_cell_type <fct>, cell_type_label <fct>

Creating the MultiAssayExperiment object

Assay list

Code
assay_list <- 
  bind_rows(
    tibble(
      mae_assay_name = "mg", 
      se_object_name = "se_mg", 
      assay_label = "Metagenomic"
      ),
    tibble(
      mae_assay_name = "qPCR", 
      se_object_name = "se_qPRC", 
      assay_label = "qPCR weekly swabs"
      ),
    tibble(
      mae_assay_name = "qPCR_daily", 
      se_object_name = "se_daily_qPRC", 
      assay_label = "qPCR home swabs"
      ),
    tibble(
      mae_assay_name = "amplicon", 
      se_object_name = "se_16S", 
      assay_label = "16S rRNA amplicon seq."
      ),
    tibble(
      mae_assay_name = "luminex", 
      se_object_name = "se_luminex", 
      assay_label = "Luminex"
      ),
    tibble(
      mae_assay_name = "flow", 
      se_object_name = "se_flow", 
      assay_label = "Flow cytometry"
      )
  ) |> 
  mutate(
    assay_label = assay_label |> fct_inorder()
  )

We will assemble the following tables (SE objects):

Code
assay_list |> gt()
mae_assay_name se_object_name assay_label
mg se_mg Metagenomic
qPCR se_qPRC qPCR weekly swabs
qPCR_daily se_daily_qPRC qPCR home swabs
amplicon se_16S 16S rRNA amplicon seq.
luminex se_luminex Luminex
flow se_flow Flow cytometry

mae_coldata_assays tables

Code
available_samples <- 
  map(
    1:nrow(assay_list),
    function(i, assay_list){
      se <- get(assay_list$se_object_name[i])
      tibble(
        assay = assay_list$mae_assay_name[i],
        se_colname = se |> colnames(), 
        uid = se$uid, 
        sample_type = se$sample_type,
        control_type = se$control_type,
        )
    },
    assay_list = assay_list
  ) |> 
  bind_rows()

if (!all(available_samples$se_colname == available_samples$uid, na.rm = TRUE)) {
  stop("The uid column in the SE objects does not match their colnames")
}

mae_coldata_assays <-
  available_samples |>
  group_by(uid) |>
  summarize(
    n_sample_type = n_distinct(sample_type),
    n_control_type = n_distinct(control_type),
    sample_type = sample_type[1],
    control_type = control_type[1],
    assays = str_c(assay, collapse = ", ")
  )

if (any(mae_coldata_assays$n_sample_type > 1)) {
  warning("Some samples have multiple sample types")
}

if (any(mae_coldata_assays$n_control_type > 1)) {
  warning("Some samples have multiple control types")
}

mae_coldata_assays <- 
  mae_coldata_assays |> 
  select(uid, sample_type, control_type, assays) 

The mae_coldata_assays table contains the following columns:

Code
mae_coldata_assays |> colnames()
[1] "uid"          "sample_type"  "control_type" "assays"      
Code
mae_coldata_assays |> str()
tibble [4,185 × 4] (S3: tbl_df/tbl/data.frame)
 $ uid         : chr [1:4185] "068100004_0000" "068100004_1000" "068100004_1001" "068100004_1002" ...
 $ sample_type : chr [1:4185] "Clinical sample" "Clinical sample" "Clinical sample" "Clinical sample" ...
 $ control_type: chr [1:4185] "" "" "" "" ...
 $ assays      : chr [1:4185] "mg, qPCR, amplicon, luminex" "mg, qPCR, amplicon, luminex, flow" "qPCR_daily, amplicon" "qPCR_daily, amplicon" ...
Code
mae_coldata_assays |> 
  dplyr::count(sample_type, control_type) |> 
  gt("Number of samples per sample type and control type") 
sample_type control_type n
Clinical sample 3962
Clinical sample (other study) 7
Negative control Nuclease-free water 46
Negative control Unused swab + C2 47
Positive control Mock 1 57
Positive control Mock 2 57
Test sample 9

We also remove the columns sample_type and control_type from the colData of the individual assays to avoid conflicts when joining with the mae@colData table in downstream analyses.

Code
for (i in 1:nrow(assay_list)) {
  se <- get(assay_list$se_object_name[i])
  if (("sample_type" %in% colnames(colData(se))) | ("control_type" %in% colnames(colData(se)))) {
    colData(se) <- colData(se)[, !colnames(colData(se)) %in% c("sample_type", "control_type")]
  }
  assign(assay_list$se_object_name[i], se)
}

MAE@colData: joining the visits and mae_coldata_assays tables

We first do a full join of the visits and mae_coldata_assays tables by uid. This will allow us to keep all visits, even those that do not have assay data, and to identify which visits are in the CRF data and which are in the assay data.

Code
mae_coldata <- 
  # we first do a full join of the CRF visits and mae_coldata_assays by uids;
  dplyr::full_join(
    visits |> select(-starts_with("specimen_collection")) |> mutate(in_CRF = TRUE),
    mae_coldata_assays |> mutate(in_assays = TRUE) |> select(in_assays, everything()),
    by = join_by(uid)
  ) |> 
  mutate(
    in_CRF = in_CRF |> replace_na(FALSE),
    in_assays = in_assays |> replace_na(FALSE)
  )
Code
# We notice that there are 5 "Clinical sample" entries for which we do have assay data but that did not appear in the CRF data:

mae_coldata |> 
  filter(is.na(visit_code), sample_type == "Clinical sample") 
  
# we fix these below
Code
mae_coldata <- 
  mae_coldata |> 
  mutate(
    needs_fixing = is.na(pid) & (sample_type == "Clinical sample") & str_detect(uid, "[0-9]{9}_[0-9]{4}"),
    pid = 
      case_when(
        needs_fixing ~ str_remove(uid, "_[0-9]{4}"),
        TRUE ~ pid
      ),
    visit_code =
      case_when(
        needs_fixing ~ str_remove(uid, "[0-9]{9}_"),
        TRUE ~ visit_code
      ),
    study_day = 
      case_when(
        needs_fixing & (visit_code %in% c("0001", "0011")) ~ -7,
        needs_fixing & in_assays & (visit_code == "1000") ~ 0,
        TRUE ~ study_day
      ),
    visit_attended = 
      case_when(
        is.na(visit_attended) & needs_fixing ~ TRUE,
        TRUE ~ visit_attended
      ),
    visit_planned = 
      case_when(
        is.na(visit_planned) & needs_fixing ~ "Unplanned visit",
        TRUE ~ visit_planned
      ),
    visit_type = 
      case_when(
        is.na(visit_type) & needs_fixing ~ "Clinic",
        TRUE ~ visit_type
      )
  ) 

We then merge that table with the participants table to add the participant-invariant information (e.g., randomized, mITT, etc.) to the mae_coldata table.

Code
# we add the participant-invariant information from the `participants` table
mae_coldata <- 
  mae_coldata |> 
  dplyr::left_join(
    participants,
    by = join_by(pid)
  )
Code
mae_coldata <- mae_coldata |> select(-needs_fixing)
Code
mae_coldata |> 
  dplyr::count(in_CRF, randomized, in_assays, sample_type, assays) |> 
  gt()
in_CRF randomized in_assays sample_type assays n
FALSE FALSE TRUE Clinical sample mg, qPCR, amplicon, flow 2
FALSE FALSE TRUE Clinical sample qPCR_daily, amplicon 1
FALSE NA TRUE Clinical sample (other study) mg, qPCR 7
FALSE NA TRUE Negative control amplicon 70
FALSE NA TRUE Negative control mg, qPCR 12
FALSE NA TRUE Negative control qPCR 11
FALSE NA TRUE Positive control amplicon 91
FALSE NA TRUE Positive control mg 1
FALSE NA TRUE Positive control mg, qPCR 10
FALSE NA TRUE Positive control qPCR 12
FALSE NA TRUE Test sample qPCR 8
FALSE NA TRUE Test sample qPCR_daily 1
TRUE FALSE FALSE NA NA 414
TRUE FALSE TRUE Clinical sample mg, qPCR, amplicon 66
TRUE FALSE TRUE Clinical sample mg, qPCR, amplicon, luminex 5
TRUE FALSE TRUE Clinical sample qPCR, amplicon 1
TRUE TRUE FALSE NA NA 864
TRUE TRUE TRUE Clinical sample amplicon 4
TRUE TRUE TRUE Clinical sample flow 1
TRUE TRUE TRUE Clinical sample luminex, flow 1
TRUE TRUE TRUE Clinical sample mg, qPCR, amplicon 49
TRUE TRUE TRUE Clinical sample mg, qPCR, amplicon, flow 3
TRUE TRUE TRUE Clinical sample mg, qPCR, amplicon, luminex 491
TRUE TRUE TRUE Clinical sample mg, qPCR, amplicon, luminex, flow 346
TRUE TRUE TRUE Clinical sample mg, qPCR, luminex 2
TRUE TRUE TRUE Clinical sample mg, qPCR, qPCR_daily, amplicon, luminex, flow 2
TRUE TRUE TRUE Clinical sample qPCR, amplicon, luminex 1
TRUE TRUE TRUE Clinical sample qPCR, amplicon, luminex, flow 1
TRUE TRUE TRUE Clinical sample qPCR_daily, amplicon 2985
TRUE TRUE TRUE Clinical sample qPCR_daily, amplicon, luminex 1

Imputing study_day for daily visit_code that did not have CRF data

There were a few entries with daily visit_code that did not have a date in the CRF data. Consequently they also did not have study days (study_day) assigned.

Code
mae_coldata |> 
  filter(
    is.na(study_day), 
    is.na(sample_type) | (sample_type == "Clinical sample")
    ) |> 
  dplyr::count(visit_type, assays, in_CRF, visit_attended, sample_type)
# A tibble: 5 × 6
  visit_type assays               in_CRF visit_attended sample_type         n
  <chr>      <chr>                <lgl>  <lgl>          <chr>           <int>
1 Clinic     <NA>                 TRUE   FALSE          <NA>               38
2 Home       qPCR_daily, amplicon TRUE   FALSE          Clinical sample   100
3 Home       qPCR_daily, amplicon TRUE   TRUE           Clinical sample     3
4 Home       <NA>                 TRUE   FALSE          <NA>              540
5 Home       <NA>                 TRUE   TRUE           <NA>                7

We impute the missing values using the study_days from “surrounding” visits.

Code
# first, we create a "dictionary" with the most common study_days for each daily swab visit code
study_days_for_daily_codes <- 
  mae_coldata |> 
  dplyr::filter(visit_type == "Home") |> 
  dplyr::count(visit_code, study_day) |> 
  arrange(visit_code, -n) |> 
  group_by(visit_code) |> 
  slice_head(n = 1) |> 
  ungroup() |> 
  dplyr::rename(exp_study_day = study_day) |> 
  select(-n)
Code
study_days_for_daily_codes |> 
  ggplot(aes(x = exp_study_day, y = visit_code)) +
  geom_point()
Code
# In the plot above, we notice that visits 1501:1503 did not have "expected" study day, so we create it by expanding the time series
study_days_for_daily_codes <-
  study_days_for_daily_codes |> 
  mutate(
    exp_study_day = 
      case_when(
        is.na(exp_study_day) & (visit_code %in% c(1501:1503)) ~ 
          as.numeric(visit_code) - 1501 + 36,
        TRUE ~ exp_study_day
      )
  )
Code
# visits from the participants with missing study_days
pid_with_missing_daily_study_days <- 
  # we first filter to retrieve the pids
  mae_coldata |> 
  filter(is.na(study_day)) |>  # , in_assays
  select(pid) |> distinct() |> 
  # we join with the coldata
  left_join(mae_coldata, by = join_by(pid)) |> 
  arrange(pid, visit_code) |> 
  # we join the study_day dictionary
  left_join(study_days_for_daily_codes) |> 
  mutate(diff = study_day - exp_study_day) |> # the typical difference for a participant between their study_day and the "expected" study_day (some participant started sampling at home one day early or late)
  group_by(pid) |> 
  fill(diff, .direction = "downup") |>  # impute when missing
  mutate(diff = ifelse(all(is.na(diff)), 0, diff)) |> # if still missing,we set to 0
  ungroup() |> 
  # we fix the study_day
  mutate(
    fixed_study_day = 
      case_when(
        is.na(study_day) & !is.na(exp_study_day) ~ exp_study_day + diff,
        TRUE ~ study_day
      )
  ) |> 
  select(uid, pid, visit_code, study_day, fixed_study_day, exp_study_day, diff, in_CRF, in_assays, crf_plates, assays)

# pid_with_missing_daily_study_days |> View()
Code
mae_coldata <- 
  mae_coldata |> 
  left_join(
    pid_with_missing_daily_study_days |> select(uid, fixed_study_day), 
    by = join_by(uid)
    ) |> 
  mutate(
    study_day =
      case_when(
        is.na(study_day) ~ fixed_study_day,
        TRUE ~ study_day
      )
  ) |> 
  select(-fixed_study_day)
Code
mae_coldata |> 
  filter(is.na(study_day), in_assays) |> 
  View()

# all is fixed

Creating the visit, visit_number, and study_week columns

Code
visit_dict <- 
  bind_rows(
    tibble(
      visit = "Screening", 
      definition = "Screening visit for which we have the most assay data. If two screening visits have the same number of assays, we take the last one.", 
      possible_visit_codes = "0000, 0001, 0010, 0011",
      visit_number = "V0",
      study_week = -1
    ),
    tibble(
      visit = "Add. screening", 
      definition = 'If a participant has data for two screening visits, one is the "main" screening visit, and the other is labelled as an additional screening visit.', 
      possible_visit_codes = "0000, 0001, 0010, 0011",
      visit_number = "V0b",
      study_week = -2
      ),
    tibble(
      visit = "Day 0 (pre-MTZ)", 
      definition = "Visit before the metronidazole treatment (MTZ) starts.", 
      possible_visit_codes = "1000, 1011",
      visit_number = "V1",
      study_week = 0
      ),
    tibble(
      visit = "Week 1 (post-MTZ)", 
      definition = "Visit after the metronidazole treatment (MTZ) ends.", 
      possible_visit_codes = "1100",
      visit_number = "V2",
      study_week = 1
      ),
    tibble(
      visit = "Week 2 (post-SP)", 
      definition = "Visit after the study product (SP) ends.", 
      possible_visit_codes = "1200, 1211",
      visit_number = "V3",
      study_week = 2
      ),
    tibble(
      visit = "Week 3", 
      definition = "Visit one week after the study product (SP) ends.", 
      possible_visit_codes = "1300",
      visit_number = "V4",
      study_week = 3
      ),
    tibble(
      visit = "Week 4", 
      definition = "Visit two weeks after the study product (SP) ends.", 
      possible_visit_codes = "1400",
      visit_number = "V5",
      study_week = 4
      ),
    tibble(
      visit = "Week 5", 
      definition = "Visit three weeks after the study product (SP) ends.", 
      possible_visit_codes = "1500, 1511, 1512",
      visit_number = "V6",
      study_week = 5
      ),
    tibble(
      visit = "Week 7", 
      definition = "Visit five weeks after the study product (SP) ends.", 
      possible_visit_codes = "1700, 1711",
      visit_number = "V7",
      study_week = 7
      ),
    tibble(
      visit = "Week 9", 
      definition = "Visit seven weeks after the study product (SP) ends.", 
      possible_visit_codes = "1900, 1911",
      visit_number = "V8",
      study_week = 9
      ),
    tibble(
      visit = "Week 12", 
      definition = "Visit ten weeks after the study product (SP) ends.", 
      possible_visit_codes = "2120, 2121, 2122",
      visit_number = "V9",
      study_week = 12
    )
  ) |> 
  mutate(
    visit = visit |> fct_reorder(study_week)
  )
Code
visit_dict |> 
  gt() |> 
  tab_header(
    title = "Visit codes and definitions"
  ) |> 
  cols_label(
    visit = "Visit",
    definition = "Definition",
    possible_visit_codes = "Possible visit codes",
    visit_number = "Visit number",
    study_week = "Study week"
  ) |> 
  cols_align(
    align = "left", columns = c(visit, definition, possible_visit_codes)
  )
Visit codes and definitions
Visit Definition Possible visit codes Visit number Study week
Screening Screening visit for which we have the most assay data. If two screening visits have the same number of assays, we take the last one. 0000, 0001, 0010, 0011 V0 -1
Add. screening If a participant has data for two screening visits, one is the "main" screening visit, and the other is labelled as an additional screening visit. 0000, 0001, 0010, 0011 V0b -2
Day 0 (pre-MTZ) Visit before the metronidazole treatment (MTZ) starts. 1000, 1011 V1 0
Week 1 (post-MTZ) Visit after the metronidazole treatment (MTZ) ends. 1100 V2 1
Week 2 (post-SP) Visit after the study product (SP) ends. 1200, 1211 V3 2
Week 3 Visit one week after the study product (SP) ends. 1300 V4 3
Week 4 Visit two weeks after the study product (SP) ends. 1400 V5 4
Week 5 Visit three weeks after the study product (SP) ends. 1500, 1511, 1512 V6 5
Week 7 Visit five weeks after the study product (SP) ends. 1700, 1711 V7 7
Week 9 Visit seven weeks after the study product (SP) ends. 1900, 1911 V8 9
Week 12 Visit ten weeks after the study product (SP) ends. 2120, 2121, 2122 V9 12

We first determine, for each participant, which is their main vs. their additional screening visit.

Code
mae_coldata <- 
  mae_coldata |> 
  group_by(pid) |> 
  mutate(
    is_screening_visit = 
      case_when(
        visit_code %in% c("0000", "0001", "0010", "0011") ~ TRUE,
        TRUE ~ FALSE
      ),
    n_assays = ifelse(in_assays, str_split(assays, ", ") |> lengths(), 0),
    screening_visit_score = is_screening_visit * (n_assays + study_day/10)
  ) |> 
  arrange(-screening_visit_score) |> 
  mutate(
    screening_visit = 
      case_when(
        is_screening_visit & (row_number() == 1) ~ "main screening",
        is_screening_visit & (row_number() > 1) ~ "additional screening",
        TRUE ~ NA_character_
      )
  ) |> 
  ungroup() |> 
  select(-is_screening_visit, -n_assays, -screening_visit_score)
Code
mae_coldata |> 
  dplyr::filter(screening_visit == "additional screening") |> 
  select(pid) |> distinct() |> 
  left_join(mae_coldata) |> 
  arrange(randomized |> desc(), pid, visit_code) |> 
  select(randomized, pid, visit_code, study_day, crf_plates, assays, screening_visit) |> 
  View()

mae_coldata |> 
  dplyr::filter(screening_visit == "additional screening") |> 
  select(pid) |> distinct() |> 
  left_join(mae_coldata) |> 
  arrange(randomized |> desc(), pid, visit_code) |> 
  select(randomized, pid, visit_code, study_day, crf_plates, assays, screening_visit) |> 
  dplyr::filter(randomized, (screening_visit == "main screening"), study_day == 0) |> 
  select(pid) |> distinct() |> 
  left_join(mae_coldata) |> 
  arrange(randomized |> desc(), pid, visit_code) |> 
  select(randomized, mITT, pid, visit_code, study_day, crf_plates, assays, screening_visit) |> 
  View()


# > check 068200020
# unclear - her main screening visit could be at day -3, but it might be at day 0, which would mean that her screening visit and pre-MTZ visit samples were collected on the same day
crf_raw$crf35 |> dplyr::filter(pid == "068200020") |> View()
Code
mae_coldata |> 
  filter(!is.na(screening_visit)) |> 
  dplyr::count(screening_visit) 
# A tibble: 2 × 2
  screening_visit          n
  <chr>                <int>
1 additional screening    66
2 main screening         532

We next join the mae_coldata table with the visit_dict table to add the visit, visit_number, and study_week columns.

Code
mae_coldata <- 
  mae_coldata |> 
  select(-any_of(c("PIPV", "visit", "visit_number", "study_week"))) |>
  left_join(
    visit_dict |> 
      mutate(visit_code = possible_visit_codes |> str_split(", ")) |>
      unnest(visit_code) |>
      mutate(
        screening_visit = 
          case_when(
            visit_number == "V0b" ~ "additional screening",
            visit_number == "V0" ~ "main screening",
            TRUE ~ NA_character_
            ),
        PIPV = visit_number != "V0b"
        ) |> 
      select(visit_code, screening_visit, PIPV, visit, visit_number, study_week)
  ) |> 
  mutate(PIPV = PIPV |> replace_na(FALSE)) |> 
  relocate(
    PIPV, visit, visit_number, study_week, .after = "visit_code"
  ) 

We check that study_days are unique for each visit_number at which assay samples were collected for each participant.

Code
mae_coldata |> 
  group_by(pid, visit_number) |> 
  mutate(n_study_day = n_distinct(study_day[!is.na(assays)])) |>
  ungroup() |> 
  filter(PIPV, n_study_day > 1) |> 
  select(uid, pid, visit_code, study_day, assays) |> 
  group_by(pid) |> 
  gt(row_group_as_column = TRUE)
uid visit_code study_day assays
068200062 068200062_1500 1500 37 mg, qPCR, amplicon
068200062_1511 1511 44 luminex, flow
068200087 068200087_1500 1500 40 mg, qPCR, amplicon, luminex
068200087_1511 1511 43 flow

There are just two participants for which the dates don’t match and the different assays were collected at different dates. This may be important for integrative analyses later, so we leave as is for now.

Creating an assay to document specimen collection, and available data

  • specimen_collection_swab (collected, not collected, unclear)
  • specimen_collection_softcup (collected, not collected, unclear)
  • specimen_collection_cytobrush (collected, not collected, unclear)
  • mg
  • qPCR
  • amplicon
  • luminex
  • flow
Code
sample_and_visit_info_assay <- 
  mae_coldata |> 
  select(uid, sample_type, in_CRF, in_assays, randomized) 

sample_and_visit_info_assay <- 
  sample_and_visit_info_assay |> 
  dplyr::left_join(
    visits |> select(uid, starts_with("specimen_collection")),
    by = join_by(uid)
  ) 

sample_and_visit_info_assay <-
  sample_and_visit_info_assay |>
  expand_grid(assay = assay_list$mae_assay_name) |>
  dplyr::left_join(
    available_samples |>
      select(uid, assay) |>
      mutate(value = TRUE),
    by = join_by(uid, assay)
  ) |>
  mutate(value = value |> replace_na(FALSE)) |>
  pivot_wider(names_from = assay, values_from = value)

# sample_and_visit_info_assay <- 
#   sample_and_visit_info_assay |> 
#   mutate(
#     mg_category = 
#       case_when(
#         !str_detect(sample_type, "Clinical sample") & mg ~ "Not a clinical sample",
#         is.na(randomized) & mg ~ "???",
#         !is.na(randomized) & !randomized & mg ~ "Additional sample from non-randomized participants",
#         !is.na(specimen_collection_swab) & (specimen_collection_swab > 0) & mg ~ "Expected data",
#         (is.na(specimen_collection_swab) | (specimen_collection_swab == 0)) & mg ~ "Unexpected data",
#         !is.na(specimen_collection_swab) & (specimen_collection_swab > 0) & !mg ~ "Missing data",
#         (is.na(specimen_collection_swab) | (specimen_collection_swab == 0)) & !mg ~ "Sample not collected",
#         TRUE ~ "ERROR"
#       ),
#     qPCR_category = 
#       case_when(
#         !str_detect(sample_type, "Clinical sample") & qPCR ~ "Not a clinical sample",
#         is.na(randomized) & qPCR ~ "???",
#         !is.na(randomized) & !randomized & qPCR ~ "Additional sample from non-randomized participants",
#         !is.na(specimen_collection_swab) & (specimen_collection_swab > 0) & qPCR ~ "Expected data",
#         (is.na(specimen_collection_swab) | (specimen_collection_swab == 0)) & qPCR ~ "Unexpected data",
#         !is.na(specimen_collection_swab) & (specimen_collection_swab > 0) & !qPCR ~ "Missing data",
#         (is.na(specimen_collection_swab) | (specimen_collection_swab == 0)) & !qPCR ~ "Sample not collected",
#         TRUE ~ "ERROR"
#       ),
#     amplicon_category = 
#       case_when(
#         !str_detect(sample_type, "Clinical sample") & amplicon ~ "Not a clinical sample",
#         is.na(randomized) & amplicon ~ "???",
#         !is.na(randomized) & !randomized & amplicon ~ "Additional sample from non-randomized participants",
#         !is.na(specimen_collection_swab) & (specimen_collection_swab > 0) & amplicon ~ "Expected data",
#         (is.na(specimen_collection_swab) | (specimen_collection_swab == 0)) & amplicon ~ "Unexpected data",
#         !is.na(specimen_collection_swab) & (specimen_collection_swab > 0) & !amplicon ~ "Missing data",
#         (is.na(specimen_collection_swab) | (specimen_collection_swab == 0)) & !amplicon ~ "Sample not collected",
#         TRUE ~ "ERROR"
#       ),
#     luminex_category = 
#       case_when(
#         !str_detect(sample_type, "Clinical sample") & luminex ~ "Not a clinical sample",
#         is.na(randomized) & luminex ~ "???",
#         !is.na(randomized) & !randomized & luminex ~ "Additional sample from non-randomized participants",
#         !is.na(specimen_collection_softcup) & (specimen_collection_softcup != "not collected") & luminex ~ "Expected data",
#         (is.na(specimen_collection_softcup) | (specimen_collection_softcup == "not collected")) & luminex ~ "Unexpected data",
#         !is.na(specimen_collection_softcup) & (specimen_collection_softcup == "collected") & !luminex ~ "Missing data",
#         (is.na(specimen_collection_softcup) | (specimen_collection_softcup == "not collected")) & !luminex ~ "Sample not collected",
#         TRUE ~ "ERROR"
#       )
#   )

# 
# The last 5 columns can take the following values:
# 
# - `expected` (swab collected and available data), 
# - `unexpected` (swab not collected but available data), 
# - `missing` (swab collected but no available data),
# - `maybe missing` (unclear if swab was collected and no available data),
# - `not collected` (swab not collected), 
# - `additional` (swab collected from a non-randomized participant), 
# - `NA` (not collected, no available data)
# 
Code
se_sample_and_visit_info_assay <- 
  SummarizedExperiment(
    assays = list(
      sample_and_visit_info = 
        sample_and_visit_info_assay |> 
        select(-c(sample_type, in_CRF, in_assays, randomized)) |> 
        as.data.frame() |> 
        column_to_rownames("uid") |> 
        t()
    ),
    colData = 
      sample_and_visit_info_assay |> select(uid) |> 
      mutate(rownames = uid) |> as.data.frame() |> 
      column_to_rownames("rownames")
  )

Creating the MultiAssayExperiment object

Code
SE_list <- 
  list(
    visit_and_sample_info = se_sample_and_visit_info_assay,
    mg = se_mg,
    qPCR = se_qPRC,
    qPCR_daily = se_daily_qPRC,
    amplicon = se_16S,
    luminex = se_luminex,
    flow = se_flow
  )


mae <- 
  MultiAssayExperiment::MultiAssayExperiment(
    experiments = MultiAssayExperiment::ExperimentList(SE_list),
    colData = 
      mae_coldata |> select(-in_CRF, -in_assays) |> 
      as.data.frame() |> mutate(rownames = uid) |> 
      column_to_rownames("rownames"),
    metadata = list(
      study = "VIBRANT",
      data_source = "actual study data (not simulated)",
      data_status = "partially QCed",
      date = today(),
      colData_dictionary = participants_variable_dictionary,
      crf_plates_description = crf_plates_dictionary,
      crf_data_raw = crf_raw,
      crf_data_clean = crf_clean, 
      participant_crfs_merged = participant_crfs_merged,
      visits_crfs_merged = visits_crfs_merged,
      exposures = exposures
    )
  )
Code
mae
A MultiAssayExperiment object of 7 listed
 experiments with user-defined names and respective classes.
 Containing an ExperimentList class object of length 7:
 [1] visit_and_sample_info: SummarizedExperiment with 9 rows and 5463 columns
 [2] mg: SummarizedExperiment with 779 rows and 996 columns
 [3] qPCR: SummarizedExperiment with 16 rows and 1029 columns
 [4] qPCR_daily: SummarizedExperiment with 16 rows and 2990 columns
 [5] amplicon: SummarizedExperiment with 817 rows and 4119 columns
 [6] luminex: SummarizedExperiment with 48 rows and 850 columns
 [7] flow: SummarizedExperiment with 11 rows and 356 columns
Functionality:
 experiments() - obtain the ExperimentList instance
 colData() - the primary/phenotype DataFrame
 sampleMap() - the sample coordination DataFrame
 `$`, `[`, `[[` - extract colData columns, subset, or experiment
 *Format() - convert into a long or wide DataFrame
 assays() - convert ExperimentList to a SimpleList of matrices
 exportClass() - save data to flat files

Available assays

Code
mae_sample_map <- 
  mae@sampleMap |> as_tibble() |> dplyr::filter(assay %in% assay_list$mae_assay_name) |>
  select(uid = primary, assay) |> 
  arrange(uid) |> 
  dplyr::left_join(assay_list |> dplyr::rename(assay = mae_assay_name), by = join_by(assay)) |> 
  dplyr::right_join(
    mae@colData |> as.data.frame() |> as_tibble() |> 
      select(uid, pid, visit_code, PIPV, visit, visit_type, sample_type, location, randomized, arm), 
    by = join_by(uid)
    )

All samples

Code
mae_sample_map |> 
  dplyr::filter(!is.na(assay)) |> 
  group_by(uid) |> mutate(n = n()) |> ungroup() |> 
  mutate(uid = uid |> fct_reorder(-n)) |>
  ggplot() +
  aes(x = uid, y = assay_label |> fct_rev(), fill = assay_label) +
  geom_tile() +
  facet_grid(. ~ sample_type, scales = "free_x", space = "free_x") +
  scale_x_discrete("Sample IDs", breaks = NULL) +
  ylab("") +
  scale_fill_brewer("Assay", type = "qual", palette = 7) + # 3
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    strip.text.x = element_text(angle = 90),
    legend.position = "bottom"
    )

Longitudinal availability for all visits

Code
mae_sample_map |> 
  dplyr::filter(!is.na(assay)) |> 
  # filter(visit_code %in% c("0000", seq(1000, 1900, by = 100), "2120")) |> 
  ggplot() +
  aes(x = assay_label, y = pid, fill = assay_label) +
  geom_tile() +
  facet_grid(
    ifelse(randomized, "Randomized", "Not randomized") + location ~ visit_code, 
    scales = "free_y", space = "free_y"
    ) + # sample_category
  scale_y_discrete("Participants IDs") + #, breaks = "NULL")  +
  xlab("") +
  scale_fill_brewer("Assay", type = "qual", palette = 7) + # 3
  theme(
    axis.text.x = element_blank(), #element_text(angle = 45, hjust = 1),
    strip.text.y = element_text(angle = 0),
    strip.text.x = element_text(angle = 90),
    legend.position = "bottom"
    )

For the weekly visits of all participants

Code
mae_sample_map |> 
  dplyr::filter(!is.na(assay)) |> 
  dplyr::filter(visit_type == "Clinic") |> 
  ggplot() +
  aes(x = assay_label, y = pid, fill = assay_label) +
  geom_tile() +
  facet_grid(
    location + ifelse(randomized, "Randomized", "Not randomized")  + arm  ~ visit_code, 
    scales = "free_y", space = "free_y"
    ) + # sample_category
  scale_y_discrete("Participants IDs") + #, breaks = "NULL")  +
  xlab("") +
  scale_fill_brewer("Assay", type = "qual", palette = 7) + # 3
  theme(
    axis.text.x = element_blank(), #element_text(angle = 45, hjust = 1),
    strip.text.y = element_text(angle = 0),
    strip.text.x = element_text(angle = 90),
    legend.position = "bottom"
    )

For the weekly visits of randomized participants

Code
mae_sample_map |> 
  dplyr::filter(randomized) |> 
  dplyr::filter(!is.na(assay)) |> 
  dplyr::filter(visit_type == "Clinic") |> 
  ggplot() +
  aes(x = assay_label, y = pid, fill = assay_label) +
  geom_tile() +
  facet_grid(
    location + ifelse(randomized, "Randomized", "Not randomized")  + arm  ~ visit_code, 
    scales = "free_y", space = "free_y"
    ) + # sample_category
  scale_y_discrete("Participants IDs") + #, breaks = "NULL")  +
  xlab("") +
  scale_fill_brewer("Assay", type = "qual", palette = 7) + # 3
  theme(
    axis.text.x = element_blank(), #element_text(angle = 45, hjust = 1),
    strip.text.y = element_text(angle = 0),
    strip.text.x = element_text(angle = 90),
    legend.position = "bottom"
    )

Code
mae_sample_map |> 
  dplyr::filter(randomized) |> 
  dplyr::filter(!is.na(assay)) |> 
  dplyr::filter(PIPV) |> 
  ggplot() +
  aes(x = assay_label, y = pid, fill = assay_label) +
  geom_tile() +
  facet_grid(
    location + arm  ~ visit, 
    scales = "free_y", space = "free_y"
    ) + # sample_category
  scale_y_discrete("Participants IDs") + #, breaks = "NULL")  +
  xlab("") +
  scale_fill_brewer("Assay", type = "qual", palette = 7) + # 3
  theme(
    axis.text.x = element_blank(), #element_text(angle = 45, hjust = 1),
    strip.text.y = element_text(angle = 0),
    strip.text.x = element_text(angle = 90),
    legend.position = "bottom"
    )

Subsetting and saving the MultiAssayExperiment objects

We save the “master” MAE (with all assays); but if needed for publication, we can subset the MAE to only include the assays/data of interest.

Code
saveRDS(
  mae, 
  str_c(
    get_02_output_dir(),
    "mae_full_", today() |> str_remove_all("-"), ".rds"
  )
)